# Flights that had an arrival delay of two or more hours.
(flights %>% filter(arr_delay >= 200))
## # A tibble: 2,832 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 848 1835 853 1001
## 2 2013 1 1 1815 1325 290 2120
## 3 2013 1 1 1842 1422 260 1958
## 4 2013 1 1 2006 1630 216 2230
## 5 2013 1 1 2115 1700 255 2330
## 6 2013 1 1 2205 1720 285 46
## 7 2013 1 1 2343 1724 379 314
## 8 2013 1 2 1244 900 224 1431
## 9 2013 1 2 1332 904 268 1616
## 10 2013 1 2 1412 838 334 1710
## # … with 2,822 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights that flew to Houston (IAH or HOU).
(flights %>% filter(dest %in% c('IAH', 'HOU')))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # … with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights operated by United, American, or Delta.
(flights %>% filter(carrier %in% c('UA', 'AA', 'DL')))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 554 600 -6 812
## 5 2013 1 1 554 558 -4 740
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 600 -1 941
## 10 2013 1 1 559 600 -1 854
## # … with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights that departed in summer (July, August, September).
(flights %>% filter(month %in% 7:9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # … with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
(flights %>%
filter(
arr_delay >= 200, dep_delay <= 0 # Arrived more than two hours late, but didn't leave late
))
## # A tibble: 0 x 19
## # … with 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights delayed by at least an hour that made up over 30 minutes in flight.
(flights %>% filter(dep_delay >= 100, dep_delay - arr_delay > 30))
## # A tibble: 1,056 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 2205 1720 285 46
## 2 2013 1 1 2326 2130 116 131
## 3 2013 1 3 1503 1221 162 1803
## 4 2013 1 3 1941 1759 102 2246
## 5 2013 1 3 2257 2000 177 45
## 6 2013 1 4 1917 1700 137 2135
## 7 2013 1 4 2010 1745 145 2257
## 8 2013 1 4 2058 1730 208 2
## 9 2013 1 4 2100 1920 100 2224
## 10 2013 1 4 2221 1858 203 116
## # … with 1,046 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
(flights %>%
filter(
# %% 2400 maps a dep_time of 2400 (midnight) to 0, so exact-midnight
# departures are included in the midnight-to-6am window.
dep_time %% 2400 <= 600 # Departed between midnight and 6am
))
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Count missing values in each column of flights, then keep only the columns
# that contain at least one NA.
col_name <- colnames(flights)
# Vectorized over columns: avoids growing a vector with c() inside a
# 1:length() index loop. integer(1) keeps the counts <int> as before.
na_count <- unname(vapply(flights, function(col) sum(is.na(col)), integer(1)))
(result <- tibble(col_name, na_count) %>% filter(na_count > 0))
## # A tibble: 6 x 2
## col_name na_count
## <chr> <int>
## 1 dep_time 8255
## 2 dep_delay 8255
## 3 arr_time 8713
## 4 arr_delay 9430
## 5 tailnum 2512
## 6 air_time 9430
8,255 flights have a missing dep_time. A missing value in dep_time means that the flight did not depart — that is, the flight was cancelled. The columns dep_delay, arr_time, arr_delay, tailnum, and air_time have missing values for the same reason.
arrange sorts boolean values so that TRUE values come after FALSE values. Thus, passing the argument !is.na(COL_NAME) to arrange sorts the data so that rows with missing values in that column come first.
# Put rows with missing dep_time (cancelled flights) first:
# !is.na(dep_time) is FALSE for them, and FALSE sorts before TRUE.
arrange(flights, !is.na(dep_time))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## 7 2013 1 2 NA 1355 NA NA
## 8 2013 1 2 NA 1420 NA NA
## 9 2013 1 2 NA 1321 NA NA
## 10 2013 1 2 NA 1545 NA NA
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# contains() is case-insensitive by default, so "TIME" matches every column whose name contains "time".
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <dbl>
## 1 517 515 830 819 227
## 2 533 529 850 830 227
## 3 542 540 923 850 160
## 4 544 545 1004 1022 183
## 5 554 600 812 837 116
## 6 554 558 740 728 150
## 7 555 600 913 854 158
## 8 557 600 709 723 53
## 9 557 600 838 846 140
## 10 558 600 753 745 138
## # … with 336,766 more rows, and 1 more variable: time_hour <dttm>
# With ignore.case = FALSE the match is case-sensitive; no column name contains "TIME", so 0 columns are selected.
select(flights, contains("TIME", ignore.case=FALSE))
## # A tibble: 336,776 x 0
The dplyr::contains function has the default setting ignore.case = TRUE, which makes the function ignore the case of the characters. Specifying ignore.case = FALSE makes the function case-sensitive.
# Convert an HHMM-encoded time (e.g. 517 means 5:17 am) to minutes after
# midnight. Vectorized over its input.
time_to_minutes <- function(time) {
  60 * (time %/% 100) + time %% 100
}
# Show the original HHMM times next to their minutes-after-midnight versions.
flights %>%
  transmute(
    dep_time,
    dep_time_minutes = time_to_minutes(dep_time),
    sched_dep_time,
    sched_dep_time_minutes = time_to_minutes(sched_dep_time)
  )
## # A tibble: 336,776 x 4
## dep_time dep_time_minutes sched_dep_time sched_dep_time_minutes
## <int> <dbl> <int> <dbl>
## 1 517 317 515 315
## 2 533 333 529 329
## 3 542 342 540 340
## 4 544 344 545 345
## 5 554 354 600 360
## 6 554 354 558 358
## 7 555 355 600 360
## 8 557 357 600 360
## 9 557 357 600 360
## 10 558 358 600 360
## # … with 336,766 more rows
# Verify that dep_delay equals the difference between dep_time and
# sched_dep_time, both converted to minutes after midnight.
flights %>%
transmute(
dep_sched_dep = time_to_minutes(dep_time) - time_to_minutes(sched_dep_time),
dep_delay,
# NOTE(review): this presumably fails for flights delayed past midnight
# (the minute difference wraps around) — confirm beyond the rows shown.
check = dep_sched_dep == dep_delay
)
## # A tibble: 336,776 x 3
## dep_sched_dep dep_delay check
## <dbl> <dbl> <lgl>
## 1 2 2 TRUE
## 2 4 4 TRUE
## 3 2 2 TRUE
## 4 -1 -1 TRUE
## 5 -6 -6 TRUE
## 6 -4 -4 TRUE
## 7 -5 -5 TRUE
## 8 -3 -3 TRUE
## 9 -3 -3 TRUE
## 10 -2 -2 TRUE
## # … with 336,766 more rows
It's easy to see that dep_time − sched_dep_time = dep_delay (with both times converted to minutes).
# Worst-case (maximum) departure and arrival delay per carrier, sorted from
# worst to best.
flights %>%
group_by(carrier) %>%
summarize(
worst_dep_delay = max(dep_delay, na.rm = TRUE), # na.rm drops cancelled flights
worst_arr_delay = max(arr_delay, na.rm = TRUE)
) %>%
arrange(desc(worst_dep_delay), desc(worst_arr_delay))
## # A tibble: 16 x 3
## carrier worst_dep_delay worst_arr_delay
## <chr> <dbl> <dbl>
## 1 HA 1301 1272
## 2 MQ 1137 1127
## 3 AA 1014 1007
## 4 DL 960 931
## 5 F9 853 834
## 6 9E 747 744
## 7 VX 653 676
## 8 FL 602 572
## 9 EV 548 577
## 10 B6 502 497
## 11 US 500 492
## 12 UA 483 455
## 13 WN 471 453
## 14 YV 387 381
## 15 AS 225 198
## 16 OO 154 157
Carrier HA has both the worst dep_delay and the worst arr_delay flights.
# Average departure and arrival delay per carrier, sorted from worst to best.
flights %>%
group_by(carrier) %>%
summarize(
mean_dep_delay = mean(dep_delay, na.rm = TRUE), # na.rm drops cancelled flights
mean_arr_delay = mean(arr_delay, na.rm = TRUE)
) %>%
arrange(desc(mean_dep_delay), desc(mean_arr_delay))
## # A tibble: 16 x 3
## carrier mean_dep_delay mean_arr_delay
## <chr> <dbl> <dbl>
## 1 F9 20.2 21.9
## 2 EV 20.0 15.8
## 3 YV 19.0 15.6
## 4 FL 18.7 20.1
## 5 WN 17.7 9.65
## 6 9E 16.7 7.38
## 7 B6 13.0 9.46
## 8 VX 12.9 1.76
## 9 OO 12.6 11.9
## 10 UA 12.1 3.56
## 11 MQ 10.6 10.8
## 12 DL 9.26 1.64
## 13 AA 8.59 0.364
## 14 AS 5.80 -9.93
## 15 HA 4.90 -6.92
## 16 US 3.78 2.13
On average, carrier F9 is the worst in both departure and arrival delay.
We can disentangle the effects of bad airports by normalizing the delays of flights by the mean delays of each airport.
# Normalize each flight's departure delay by its origin airport's mean and
# standard deviation, so carrier comparisons are not confounded by airports.
flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(origin) %>%
  mutate(
    airport_delay_mean = mean(dep_delay),
    airport_delay_std = sd(dep_delay), # sd() instead of hand-rolled var()^(1/2)
    normal_dep_delay = (dep_delay - airport_delay_mean) / airport_delay_std
  ) %>%
  group_by(carrier) %>%
  summarize(
    delay = mean(normal_dep_delay)
  ) %>%
  arrange(desc(delay))
## # A tibble: 16 x 2
## carrier delay
## <chr> <dbl>
## 1 F9 0.247
## 2 YV 0.216
## 3 FL 0.210
## 4 EV 0.139
## 5 WN 0.123
## 6 9E 0.120
## 7 OO 0.0304
## 8 B6 0.0191
## 9 VX -0.00251
## 10 MQ -0.0169
## 11 UA -0.0517
## 12 DL -0.0576
## 13 AA -0.0754
## 14 HA -0.185
## 15 US -0.195
## 16 AS -0.225
# Relate each flight's departure delay to the delay of the preceding flight
# from the same origin airport.
flights_lagged <- flights %>%
  group_by(origin) %>%
  arrange(year, month, day, dep_time) %>%
  mutate(lagging_delay = lag(dep_delay)) # previous flight's delay, per origin
lagged_delay <- flights_lagged %>%
  group_by(lagging_delay) %>%
  # na.rm = TRUE: without it, any group containing a cancelled flight
  # (dep_delay == NA) gets an NA mean and silently vanishes from the plot.
  summarize(mean_lagged_delay = mean(dep_delay, na.rm = TRUE))
ggplot(lagged_delay, aes(lagging_delay, mean_lagged_delay)) +
  geom_point(na.rm = TRUE)
Delay of preceding flight is almost linearly related with the delay of following flight, for up to 250 minutes of preceding delay. From 250 to 500 minutes of preceding delay, the plot implies the positive relationship, but with much bigger variance. From 500 minutes of preceding delay, the following delay seems to be nearly 0.
# Histograms of diamond prices under $10,000 at a range of binwidths.
binwidths <- c(5, 10, 30, 50, 100, 200, 500, 1000) # <- instead of = for assignment
gg_diamonds <- diamonds %>%
  filter(price < 10000) %>%
  ggplot(aes(x = price))
# Iterate over the values directly rather than indexing with 1:length().
for (binwidth in binwidths) {
  plot <- gg_diamonds +
    geom_histogram(binwidth = binwidth) +
    ggtitle(paste('Binwidth: ', binwidth))
  print(plot)
}
# Bin prices into $50-wide intervals and count the diamonds per bin.
# count() is shorthand for group_by() followed by summarize(n = n()).
diamonds %>%
  count(cut_width(price, 50))
## # A tibble: 369 x 2
## `cut_width(price, 50)` n
## <fct> <int>
## 1 [325,375] 105
## 2 (375,425] 407
## 3 (425,475] 768
## 4 (475,525] 935
## 5 (525,575] 1275
## 6 (575,625] 1373
## 7 (625,675] 1381
## 8 (675,725] 1503
## 9 (725,775] 1389
## 10 (775,825] 1295
## # … with 359 more rows
There is no diamond with the price between (1475, 1525], which is unusual. Also, there’s a slight increase in the counts of diamonds around from $3,750 to $5,000.
# Compare cancelled vs non-cancelled flights by scheduled departure hour,
# using densities because the two groups have very different sizes.
flights %>%
  mutate(
    cancelled = is.na(dep_time),
    # Convert HHMM to a decimal hour in a single step.
    sched_dep_time = (sched_dep_time %/% 100) + (sched_dep_time %% 100) / 60
  ) %>%
  ggplot(aes(sched_dep_time, ..density..)) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1)
The problem was that there are many more non-cancelled flights, which makes a comparison based on raw counts difficult. By comparing densities — standardized counts — the comparison becomes more meaningful. The graph shows that flights tend to be cancelled more often in the afternoon (specifically, after 3pm).
# Plot price against every variable in diamonds, one plot per variable.
vars <- colnames(diamonds)
# Iterate over the names directly rather than indexing with 1:length().
for (var in vars) {
  plot <- diamonds %>%
    # .data[[var]] refers to the column inside the plot's own data, instead of
    # indexing the diamonds object from the calling environment inside aes().
    ggplot(aes(x = .data[[var]], y = price)) +
    geom_point(alpha = 1/10) +
    xlab(var)
  print(plot)
}
Seemingly, “x” and “carat” is most important in predicting the price, as they seem to have exponential relationship with the price of the diamond. “y” and “z” also seem to have exponential relationship, but they have too short range. Note that “x” and “carat” both indicates the size of diamonds. We can easily know that “x” and “carat” of a diamond have quite high correlation. Then we can expect that the relationship between “x” and “cut” would be similar to the relationship between “carat” and “cut”.
# Scatter of x against carat: the two size measures track each other closely.
ggplot(diamonds, aes(x, carat)) +
  geom_point()
# Carat distribution by cut, zoomed to 0-2 carats. coord_cartesian() zooms
# without dropping the data outside the limits.
ggplot(diamonds, aes(cut, carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2))
Now the graph shows that the more ideal the cut, the smaller the diamonds are, which makes ideal-cut diamonds cheaper than fair-cut ones.
# Average departure delay per destination and month as a heatmap.
flights %>%
  group_by(dest, month) %>%
  # summarize() to one row per (dest, month); the original mutate() kept every
  # flight and filled tiles with raw dep_delay instead of the computed average.
  summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(dest, month)) +
  geom_tile(aes(fill = avg_delay))
There are too many destinations to fit in one graph.
library(d3heatmap)
# Interactive heatmap of average delay by month and destination.
flights %>%
  group_by(month, dest) %>%
  filter(n() >= 50) %>% # keep reasonably frequent destinations only
  summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  # pivot_wider() supersedes spread(): one row per month, one column per dest.
  pivot_wider(names_from = dest, values_from = avg_delay) %>%
  d3heatmap(., Rowv = FALSE, Colv = FALSE)
We can use the d3heatmap library instead, which offers an interactive graph. Also, we can filter to only the more interesting destinations.
# Binned 2d counts of x vs y, zoomed in on the 4-11 mm range.
diamonds %>%
  ggplot(aes(x, y)) +
  geom_bin2d() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
# Raw scatter of the same region: individual unusual points stay visible.
diamonds %>%
  ggplot(aes(x, y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
As a binned plot summarizes the many data points falling in each bin, it is difficult to tell that any specific data point is unusual.
# A tibble with non-syntactic (numeric) column names. Column `2` is twice
# column `1` plus standard-normal noise, so exact values vary between runs.
(annoying <- tibble(
`1` = 1:10,
`2` = `1` * 2 + rnorm(length(`1`))
))
## # A tibble: 10 x 2
## `1` `2`
## <int> <dbl>
## 1 1 1.95
## 2 2 3.86
## 3 3 5.93
## 4 4 7.00
## 5 5 9.33
## 6 6 10.7
## 7 7 14.9
## 8 8 17.5
## 9 9 18.0
## 10 10 19.0
# 1.
# Subsetting a tibble with [ always returns a tibble, here 10 x 1.
annoying[, "1"]
## # A tibble: 10 x 1
## `1`
## <int>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
# 2.
# Backticks are required to refer to the non-syntactic column names in aes().
ggplot(annoying, aes(`1`, `2`)) +
  geom_point()
# 3.
# Add a third column holding the ratio of column `2` to column `1`.
(annoying <- annoying %>%
  mutate(`3` = `2` / `1`))
## # A tibble: 10 x 3
## `1` `2` `3`
## <int> <dbl> <dbl>
## 1 1 1.95 1.95
## 2 2 3.86 1.93
## 3 3 5.93 1.98
## 4 4 7.00 1.75
## 5 5 9.33 1.87
## 6 6 10.7 1.78
## 7 7 14.9 2.13
## 8 8 17.5 2.18
## 9 9 18.0 2.00
## 10 10 19.0 1.90
# 4.
annoying %>%
rename("one" = "1", `two` = `2`, three = 3)
## # A tibble: 10 x 3
## one two three
## <int> <dbl> <dbl>
## 1 1 1.95 1.95
## 2 2 3.86 1.93
## 3 3 5.93 1.98
## 4 4 7.00 1.75
## 5 5 9.33 1.87
## 6 6 10.7 1.78
## 7 7 14.9 2.13
## 8 8 17.5 2.18
## 9 9 18.0 2.00
## 10 10 19.0 1.90
# The third value of each row is dropped, because the header declares only 2 columns.
read_csv("a,b\n1,2,3\n4,5,6")
## Warning: 2 parsing failures.
## row col expected actual file
## 1 -- 2 columns 3 columns literal data
## 2 -- 2 columns 3 columns literal data
## # A tibble: 2 x 2
## a b
## <dbl> <dbl>
## 1 1 2
## 2 4 5
# The fourth value of row 2 is dropped, and column c of row 1 is NA, because the header declares 3 columns.
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: 2 parsing failures.
## row col expected actual file
## 1 -- 3 columns 2 columns literal data
## 2 -- 3 columns 4 columns literal data
## # A tibble: 2 x 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 NA
## 2 1 2 3
# The opening quote before 1 is never closed, so parsing fails at end of file.
read_csv("a,b\n\"1")
## Warning: 2 parsing failures.
## row col expected actual file
## 1 a closing quote at end of file literal data
## 1 -- 2 columns 1 columns literal data
## # A tibble: 1 x 2
## a b
## <dbl> <chr>
## 1 1 <NA>
# 1 and 2 in the first row are parsed as character, not integer, because a column must have a single type.
read_csv("a,b\n1,2\na,b")
## # A tibble: 2 x 2
## a b
## <chr> <chr>
## 1 1 2
## 2 a b
# We need to specify the delimiter with read_delim() (or use read_csv2()) to split the values.
read_csv("a;b\n1;3")
## # A tibble: 1 x 1
## `a;b`
## <chr>
## 1 1;3
# An xlsx file exported from a django model.
orders <- readxl::read_excel("./etc/orders.xlsx") # <- instead of = for assignment
# Locale carrying django's datetime format and the Seoul timezone.
django_locale <- locale(date_format = "%Y-%m-%d %H:%M:%S", tz = "Asia/Seoul")
parse_datetime(orders$created, locale = django_locale)
## [1] "2019-10-10 07:42:09 KST" "2019-10-10 04:35:56 KST"
## [3] "2019-10-10 04:00:58 KST" "2019-10-10 02:19:50 KST"
## [5] "2019-10-10 02:03:38 KST" "2019-10-10 00:08:25 KST"
## [7] "2019-10-08 07:27:06 KST" "2019-10-08 06:09:34 KST"
## [9] "2019-10-08 06:01:30 KST" "2019-10-08 04:16:22 KST"
## [11] "2019-10-08 04:11:52 KST" "2019-10-08 02:28:28 KST"
## [13] "2019-10-08 01:20:54 KST" "2019-10-07 07:51:22 KST"
## [15] "2019-10-07 01:15:57 KST" "2019-10-05 01:31:51 KST"
## [17] "2019-10-04 01:47:21 KST" "2019-10-04 00:52:27 KST"
## [19] "2019-10-03 04:39:21 KST" "2019-10-03 00:07:31 KST"
## [21] "2019-10-02 07:28:18 KST" "2019-10-02 07:04:23 KST"
## [23] "2019-10-02 03:32:26 KST" "2019-10-02 00:01:14 KST"
## [25] "2019-10-01 02:24:38 KST" "2019-10-01 02:03:48 KST"
## [27] "2019-10-01 01:57:54 KST" "2019-10-01 01:40:08 KST"
## [29] "2019-10-01 00:20:51 KST" "2019-09-30 07:18:22 KST"
## [31] "2019-09-30 04:53:14 KST" "2019-09-30 04:24:18 KST"
## [33] "2019-09-30 02:28:05 KST" "2019-09-30 02:13:26 KST"
## [35] "2019-09-30 00:31:26 KST" "2019-09-27 06:42:43 KST"
## [37] "2019-09-27 05:55:33 KST" "2019-09-27 03:51:09 KST"
## [39] "2019-09-27 03:36:24 KST" "2019-09-26 23:59:54 KST"
## [41] "2019-09-26 08:42:26 KST" "2019-09-26 05:08:54 KST"
## [43] "2019-09-26 04:11:57 KST" "2019-09-26 02:39:22 KST"
## [45] "2019-09-26 00:24:39 KST" "2019-09-25 06:45:46 KST"
## [47] "2019-09-25 02:12:37 KST" "2019-09-25 00:52:57 KST"
## [49] "2019-09-24 01:42:33 KST" "2019-09-23 06:39:42 KST"
## [51] "2019-09-23 06:05:55 KST"
# Sample date/time strings in various formats, parsed below.
d1 <- "January 1, 2010" # full month name, day, year
d2 <- "2015-Mar-07" # year, abbreviated month, day
d3 <- "06-Jun-2017" # day, abbreviated month, year
d4 <- c("August 19 (2015)", "July 1 (2015)") # month day (year)
d5 <- "12/30/14" # Dec 30, 2014
t1 <- "1705" # HHMM, 24-hour clock
t2 <- "11:15:10.12 PM" # 12-hour clock with fractional seconds
parse_date(d1, "%B %d, %Y")
## [1] "2010-01-01"
parse_date(d2, "%Y-%b-%d")
## [1] "2015-03-07"
parse_date(d3, "%d-%b-%Y")
## [1] "2017-06-06"
parse_date(d4, "%B %d (%Y)")
## [1] "2015-08-19" "2015-07-01"
parse_date(d5, "%m/%d/%y")
## [1] "2014-12-30"
parse_time(t1, "%H%M")
## 17:05:00
parse_time(t2, "%H:%M:%OS %p")
## 23:15:10.12
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
options("getSymbols.warning4.0" = FALSE)
samsung_xts <- getSymbols("005930.KS", auto.assign = FALSE)
## Warning: 005930.KS contains missing values. Some functions will not work if
## objects contain missing values in the middle of the series. Consider using
## na.omit(), na.approx(), na.fill(), etc to remove or replace them.
library(timetk)
samsung_tbl <- timetk::tk_tbl(samsung_xts, rename_index = "date")
head(samsung_tbl)
## # A tibble: 6 x 7
## date `005930.KS.Open` `005930.KS.High` `005930.KS.Low`
## <date> <dbl> <dbl> <dbl>
## 1 2007-01-02 12400 12540 12320
## 2 2007-01-03 12540 12560 12220
## 3 2007-01-04 12220 12240 12060
## 4 2007-01-05 12160 12180 11900
## 5 2007-01-08 11840 11880 11580
## 6 2007-01-09 11740 11880 11640
## # … with 3 more variables: `005930.KS.Close` <dbl>,
## # `005930.KS.Volume` <dbl>, `005930.KS.Adjusted` <dbl>
We can convert an xts object to a tibble easily using the timetk library.
# Daily closing price over time.
samsung_tbl %>%
  ggplot(aes(date, `005930.KS.Close`)) +
  geom_line()
quantmod::getFX("USD/KRW")
## [1] "USD/KRW"
quantmod::getFX("JPY/KRW")
## [1] "JPY/KRW"
# Convert the exchange-rate xts objects to tibbles and join them onto the
# stock prices, keeping only dates present in all three series.
jpykrw <- tk_tbl(JPYKRW, rename_index = "date")
usdkrw <- tk_tbl(USDKRW, rename_index = "date")
samsung_adjusted <- samsung_tbl %>%
  # Explicit join keys instead of relying on the natural-join message.
  inner_join(jpykrw, by = "date") %>%
  inner_join(usdkrw, by = "date") %>%
  filter(!is.na(`005930.KS.Close`)) %>%
  mutate(
    in_krw = `005930.KS.Close`,
    in_jpy = in_krw / JPY.KRW,
    in_usd = in_krw / USD.KRW
  )
## Joining, by = "date"
## Joining, by = "date"
# Rescale the JPY- and USD-denominated series so all three start at the same
# value, then plot them together.
initial_krw <- samsung_adjusted$in_krw[1]
initial_jpy <- samsung_adjusted$in_jpy[1]
initial_usd <- samsung_adjusted$in_usd[1]
samsung_adjusted %>%
  mutate(
    in_jpy_adj = in_jpy * (initial_krw / initial_jpy),
    in_usd_adj = in_usd * (initial_krw / initial_usd)
  ) %>%
  ggplot(aes(date)) +
  # Map meaningful labels and pin each to a color explicitly. The original
  # mapped color to literal color names and relabeled the alphabetically-sorted
  # levels with scale_color_discrete(), which mislabeled the USD (blue) line as
  # "JPY" and the JPY (red) line as "USD".
  geom_line(aes(y = in_krw, color = "KRW")) +
  geom_line(aes(y = in_jpy_adj, color = "JPY")) +
  geom_line(aes(y = in_usd_adj, color = "USD")) +
  scale_color_manual(
    name = "Currency",
    values = c(KRW = "black", JPY = "red", USD = "blue")
  )
Using inner_join, we keep only the dates common to the stock price data and the exchange rate data.
(census <- read_csv("./etc/census.csv"))
## Warning: Duplicated column names deduplicated: '2018' => '2018_1' [4],
## '2018' => '2018_2' [5], '2018' => '2018_3' [6], '2018' => '2018_4' [7],
## '2018' => '2018_5' [8], '2018' => '2018_6' [9], '2018' => '2018_7' [10]
## Parsed with column specification:
## cols(
## `행정구역별(읍면동)` = col_character(),
## 연령별 = col_character(),
## `2018` = col_character(),
## `2018_1` = col_character(),
## `2018_2` = col_character(),
## `2018_3` = col_character(),
## `2018_4` = col_character(),
## `2018_5` = col_character(),
## `2018_6` = col_character(),
## `2018_7` = col_character()
## )
## # A tibble: 305 x 10
## `행정구역별(읍면동)`… 연령별 `2018` `2018_1` `2018_2` `2018_3` `2018_4`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 행정구역별(읍면동)… 연령별 총인구(명… 총인구_남자(… 총인구_여자(… 총인구_성비… 내국인(명)…
## 2 서울특별시 합계 9,673… 4,718,9… 4,954,9… 95.2 9,299,5…
## 3 종로구 합계 157,9… 77,147 80,820 95.5 145,911
## 4 중구 합계 129,7… 62,975 66,822 94.2 117,809
## 5 용산구 합계 226,9… 110,685 116,253 95.2 207,812
## 6 성동구 합계 306,7… 150,623 156,173 96.4 295,489
## 7 광진구 합계 362,3… 175,762 186,542 94.2 342,575
## 8 동대문구 합계 358,1… 176,323 181,818 97.0 338,293
## 9 중랑구 합계 391,6… 193,599 198,069 97.7 385,082
## 10 성북구 합계 438,7… 211,204 227,530 92.8 424,211
## # … with 295 more rows, and 3 more variables: `2018_5` <chr>,
## # `2018_6` <chr>, `2018_7` <chr>
Original data
# Column names supplied manually, since the original header rows are skipped.
col_names <- c( # <- instead of = for assignment
  "행정구역별(읍면동)",
  "연령별",
  "총인구(명)",
  "총인구_남자(명)",
  "총인구_여자(명)",
  "총인구_성비",
  "내국인(명)",
  "내국인_남자(명)",
  "내국인_여자(명)",
  "내국인_성비"
)
seoul2018 <- read_csv(
  "./etc/census.csv",
  skip = 3, # drop the title/header rows so column types are guessed correctly
  n_max = 25, # keep only the 25 district rows
  col_names = col_names
)
## Parsed with column specification:
## cols(
## `행정구역별(읍면동)` = col_character(),
## 연령별 = col_character(),
## `총인구(명)` = col_number(),
## `총인구_남자(명)` = col_number(),
## `총인구_여자(명)` = col_number(),
## 총인구_성비 = col_double(),
## `내국인(명)` = col_number(),
## `내국인_남자(명)` = col_number(),
## `내국인_여자(명)` = col_number(),
## 내국인_성비 = col_double()
## )
seoul2018
## # A tibble: 25 x 10
## `행정구역별(읍면동)`… 연령별 `총인구(명)` `총인구_남자(명)`… `총인구_여자(명)`…
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 종로구 합계 157967 77147 80820
## 2 중구 합계 129797 62975 66822
## 3 용산구 합계 226938 110685 116253
## 4 성동구 합계 306796 150623 156173
## 5 광진구 합계 362304 175762 186542
## 6 동대문구 합계 358141 176323 181818
## 7 중랑구 합계 391668 193599 198069
## 8 성북구 합계 438734 211204 227530
## 9 강북구 합계 309138 150028 159110
## 10 도봉구 합계 328243 159569 168674
## # … with 15 more rows, and 5 more variables: 총인구_성비 <dbl>,
## # `내국인(명)` <dbl>, `내국인_남자(명)` <dbl>, `내국인_여자(명)` <dbl>,
## # 내국인_성비 <dbl>
I skipped the first 3 rows so that the peripheral header information is gone and the default column types are guessed appropriately, which required setting the column names manually. Also, with the n_max option I kept only 25 rows from the data source.
# Drop the second column (연령별), which is "합계" for every row.
seoul2018tidy <- seoul2018 %>%
select(-2)
seoul2018tidy
## # A tibble: 25 x 9
## `행정구역별(읍면동)`… `총인구(명)` `총인구_남자(명)`… `총인구_여자(명)`…
## <chr> <dbl> <dbl> <dbl>
## 1 종로구 157967 77147 80820
## 2 중구 129797 62975 66822
## 3 용산구 226938 110685 116253
## 4 성동구 306796 150623 156173
## 5 광진구 362304 175762 186542
## 6 동대문구 358141 176323 181818
## 7 중랑구 391668 193599 198069
## 8 성북구 438734 211204 227530
## 9 강북구 309138 150028 159110
## 10 도봉구 328243 159569 168674
## # … with 15 more rows, and 5 more variables: 총인구_성비 <dbl>,
## # `내국인(명)` <dbl>, `내국인_남자(명)` <dbl>, `내국인_여자(명)` <dbl>,
## # 내국인_성비 <dbl>
The second column, whose value is "합계" for every row, is useless, so I removed it. Apart from that, the data is tidy: each row holds the data for one district, and each column holds the value indicated by its name.
This is already handled by the default col_types guessing of read_csv. Parsing a numeric vector manually can be done like:
# Parse a comma-grouped number with an explicit grouping mark. Use <- and a
# distinct variable name so readr's locale() function is not shadowed.
number_locale <- locale(grouping_mark = ",")
parse_number("192,014,152", locale = number_locale)
## [1] 192014152
# Summary statistics of the district populations, plus a same-size random
# normal sample for visual comparison.
pops <- seoul2018tidy[[2]]
(pops_n <- length(pops))
(pops_mean <- mean(pops))
(pops_std <- sd(pops)) # sd() instead of hand-rolled var()^(1/2)
random_normal_sample <- rnorm(pops_n, pops_mean, pops_std)
hist(pops)
hist(random_normal_sample)
The random normal sample looks more bell-shaped: its data points are concentrated around, and symmetric about, the center (mean). The histogram of pops also shows data points concentrated around the center, but with more data points to the right of the mean. Reference: https://www.datamentor.io/r-programming/examples/random-number/
pops_mean
## [1] 386957.4
sample(pops, 10, replace=TRUE) %>% mean()
## [1] 393771
sample(pops, 10, replace=TRUE) %>% mean()
## [1] 410301.9
They are all different. That’s because the sample mean is a random variable.
pops_mean
## [1] 386957.4
replicate(100, sample(pops, 10, replace=TRUE) %>% mean()) %>% mean
## [1] 389497.5
replicate(1000, sample(pops, 10, replace=TRUE) %>% mean()) %>% mean
## [1] 387100
replicate(10000, sample(pops, 10, replace=TRUE) %>% mean()) %>% mean
## [1] 386985.9
The average of the sample means gets closer to the population mean as the number of replications grows. This phenomenon is called the law of large numbers, and it shows that the sample mean is a consistent estimator of the population mean.
pops_mean
## [1] 386957.4
sample(pops, 10, replace=TRUE) %>% mean()
## [1] 374167
sample(pops, 25, replace=TRUE) %>% mean()
## [1] 395694.4
sample(pops, 100, replace=TRUE) %>% mean()
## [1] 399757.6
The numbers get closer to the population mean.
replicate(10000, sample(pops, 10, replace=TRUE) %>% mean()) %>%
hist()
replicate(10000, sample(pops, 25, replace=TRUE) %>% mean()) %>%
hist()
replicate(10000, sample(pops, 100, replace=TRUE) %>% mean()) %>%
hist()
The histograms look more like a normal distribution than hist(pops) does. This is the Central Limit Theorem, which states that the sample mean approximately follows the normal distribution N(mean, var/n).
replicate(1000, sample(pops, 10, replace=TRUE) %>% mean()) %>%
quantile(probs = seq(0, 1, 0.025))
## 0% 2.5% 5% 7.5% 10% 12.5% 15% 17.5%
## 224261.4 319930.1 328155.8 335329.0 340621.4 345162.5 350103.1 354434.0
## 20% 22.5% 25% 27.5% 30% 32.5% 35% 37.5%
## 356698.6 360396.3 362711.7 365394.5 367577.4 370471.6 374101.8 377307.7
## 40% 42.5% 45% 47.5% 50% 52.5% 55% 57.5%
## 379237.7 381629.3 384364.0 386994.1 388890.7 390537.6 392605.6 394408.3
## 60% 62.5% 65% 67.5% 70% 72.5% 75% 77.5%
## 396037.2 398994.7 402590.4 405861.1 408681.2 411167.9 413522.9 416760.9
## 80% 82.5% 85% 87.5% 90% 92.5% 95% 97.5%
## 420091.7 422978.2 426715.3 430780.1 435252.8 441631.9 448660.1 460804.7
## 100%
## 503071.7
204,885 is less than the 2.5% quantile value, which means a sample mean of 204,885 is extremely unlikely under Seoul's population data. That is, the probability of a sample mean at least that far from the population mean is less than 5%. So I don't think that data comes from Seoul's population.